library(tidyverse)
library(GGally)
library(cowplot)
library(class)
library(caret)
library(e1071)
library(reshape2)
library(stringr)
library(naniar)
library(skimr) #for data summary
Rcolors <- c("lightseagreen", "darkslateblue","orange")
# The dataset has 870 observations (rows) and 36 variables (columns).
employee.df <- read.csv("CaseStudy2-data.csv", header = T)
str(employee.df)
## 'data.frame': 870 obs. of 36 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 32 40 35 32 24 27 41 37 34 34 ...
## $ Attrition : chr "No" "No" "No" "No" ...
## $ BusinessTravel : chr "Travel_Rarely" "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" ...
## $ DailyRate : int 117 1308 200 801 567 294 1283 309 1333 653 ...
## $ Department : chr "Sales" "Research & Development" "Research & Development" "Sales" ...
## $ DistanceFromHome : int 13 14 18 1 2 10 5 10 10 10 ...
## $ Education : int 4 3 2 4 1 2 5 4 4 4 ...
## $ EducationField : chr "Life Sciences" "Medical" "Life Sciences" "Marketing" ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
## $ EnvironmentSatisfaction : int 2 3 3 3 1 4 2 4 3 4 ...
## $ Gender : chr "Male" "Male" "Male" "Female" ...
## $ HourlyRate : int 73 44 60 48 32 32 90 88 87 92 ...
## $ JobInvolvement : int 3 2 3 3 3 3 4 2 3 2 ...
## $ JobLevel : int 2 5 3 3 1 3 1 2 1 2 ...
## $ JobRole : chr "Sales Executive" "Research Director" "Manufacturing Director" "Sales Executive" ...
## $ JobSatisfaction : int 4 3 4 4 4 1 3 4 3 3 ...
## $ MaritalStatus : chr "Divorced" "Single" "Single" "Married" ...
## $ MonthlyIncome : int 4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
## $ MonthlyRate : int 9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
## $ NumCompaniesWorked : int 2 1 2 1 1 1 2 2 1 1 ...
## $ Over18 : chr "Y" "Y" "Y" "Y" ...
## $ OverTime : chr "No" "No" "No" "No" ...
## $ PercentSalaryHike : int 11 14 11 19 13 21 12 14 19 14 ...
## $ PerformanceRating : int 3 3 3 3 3 4 3 3 3 3 ...
## $ RelationshipSatisfaction: int 3 1 3 3 3 3 1 3 4 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 1 0 0 2 0 2 0 3 1 1 ...
## $ TotalWorkingYears : int 8 21 10 14 6 9 7 8 1 8 ...
## $ TrainingTimesLastYear : int 3 2 2 3 2 4 5 5 2 3 ...
## $ WorkLifeBalance : int 2 4 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 5 20 2 14 6 9 4 1 1 8 ...
## $ YearsInCurrentRole : int 2 7 2 10 3 7 2 0 1 2 ...
## $ YearsSinceLastPromotion : int 0 4 2 5 1 1 0 0 0 7 ...
## $ YearsWithCurrManager : int 3 9 2 7 3 7 3 0 0 7 ...
skim(employee.df)
| Name | employee.df |
| Number of rows | 870 |
| Number of columns | 36 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| numeric | 27 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Attrition | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| BusinessTravel | 0 | 1 | 10 | 17 | 0 | 3 | 0 |
| Department | 0 | 1 | 5 | 22 | 0 | 3 | 0 |
| EducationField | 0 | 1 | 5 | 16 | 0 | 6 | 0 |
| Gender | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
| JobRole | 0 | 1 | 7 | 25 | 0 | 9 | 0 |
| MaritalStatus | 0 | 1 | 6 | 8 | 0 | 3 | 0 |
| Over18 | 0 | 1 | 1 | 1 | 0 | 1 | 0 |
| OverTime | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| ID | 0 | 1 | 435.50 | 251.29 | 1 | 218.25 | 435.5 | 652.75 | 870 | ▇▇▇▇▇ |
| Age | 0 | 1 | 36.83 | 8.93 | 18 | 30.00 | 35.0 | 43.00 | 60 | ▂▇▇▃▂ |
| DailyRate | 0 | 1 | 815.23 | 401.12 | 103 | 472.50 | 817.5 | 1165.75 | 1499 | ▇▇▇▇▇ |
| DistanceFromHome | 0 | 1 | 9.34 | 8.14 | 1 | 2.00 | 7.0 | 14.00 | 29 | ▇▅▂▂▂ |
| Education | 0 | 1 | 2.90 | 1.02 | 1 | 2.00 | 3.0 | 4.00 | 5 | ▂▅▇▆▁ |
| EmployeeCount | 0 | 1 | 1.00 | 0.00 | 1 | 1.00 | 1.0 | 1.00 | 1 | ▁▁▇▁▁ |
| EmployeeNumber | 0 | 1 | 1029.83 | 604.79 | 1 | 477.25 | 1039.0 | 1561.50 | 2064 | ▇▇▇▇▇ |
| EnvironmentSatisfaction | 0 | 1 | 2.70 | 1.10 | 1 | 2.00 | 3.0 | 4.00 | 4 | ▅▆▁▇▇ |
| HourlyRate | 0 | 1 | 65.61 | 20.13 | 30 | 48.00 | 66.0 | 83.00 | 100 | ▇▇▆▇▇ |
| JobInvolvement | 0 | 1 | 2.72 | 0.70 | 1 | 2.00 | 3.0 | 3.00 | 4 | ▁▃▁▇▁ |
| JobLevel | 0 | 1 | 2.04 | 1.09 | 1 | 1.00 | 2.0 | 3.00 | 5 | ▇▇▃▂▁ |
| JobSatisfaction | 0 | 1 | 2.71 | 1.11 | 1 | 2.00 | 3.0 | 4.00 | 4 | ▅▅▁▇▇ |
| MonthlyIncome | 0 | 1 | 6390.26 | 4597.70 | 1081 | 2839.50 | 4945.5 | 8182.00 | 19999 | ▇▅▂▁▁ |
| MonthlyRate | 0 | 1 | 14325.62 | 7108.38 | 2094 | 8092.00 | 14074.5 | 20456.25 | 26997 | ▇▇▇▇▇ |
| NumCompaniesWorked | 0 | 1 | 2.73 | 2.52 | 0 | 1.00 | 2.0 | 4.00 | 9 | ▇▃▂▂▁ |
| PercentSalaryHike | 0 | 1 | 15.20 | 3.68 | 11 | 12.00 | 14.0 | 18.00 | 25 | ▇▅▃▂▁ |
| PerformanceRating | 0 | 1 | 3.15 | 0.36 | 3 | 3.00 | 3.0 | 3.00 | 4 | ▇▁▁▁▂ |
| RelationshipSatisfaction | 0 | 1 | 2.71 | 1.10 | 1 | 2.00 | 3.0 | 4.00 | 4 | ▅▅▁▇▇ |
| StandardHours | 0 | 1 | 80.00 | 0.00 | 80 | 80.00 | 80.0 | 80.00 | 80 | ▁▁▇▁▁ |
| StockOptionLevel | 0 | 1 | 0.78 | 0.86 | 0 | 0.00 | 1.0 | 1.00 | 3 | ▇▇▁▂▁ |
| TotalWorkingYears | 0 | 1 | 11.05 | 7.51 | 0 | 6.00 | 10.0 | 15.00 | 40 | ▇▇▂▁▁ |
| TrainingTimesLastYear | 0 | 1 | 2.83 | 1.27 | 0 | 2.00 | 3.0 | 3.00 | 6 | ▂▇▇▂▃ |
| WorkLifeBalance | 0 | 1 | 2.78 | 0.71 | 1 | 2.00 | 3.0 | 3.00 | 4 | ▁▃▁▇▂ |
| YearsAtCompany | 0 | 1 | 6.96 | 6.02 | 0 | 3.00 | 5.0 | 10.00 | 40 | ▇▃▁▁▁ |
| YearsInCurrentRole | 0 | 1 | 4.20 | 3.64 | 0 | 2.00 | 3.0 | 7.00 | 18 | ▇▃▂▁▁ |
| YearsSinceLastPromotion | 0 | 1 | 2.17 | 3.19 | 0 | 0.00 | 1.0 | 3.00 | 15 | ▇▁▁▁▁ |
| YearsWithCurrManager | 0 | 1 | 4.14 | 3.57 | 0 | 2.00 | 3.0 | 7.00 | 17 | ▇▂▅▁▁ |
NoSalary <- read.csv("CaseStudy2CompSet(NoSalary).csv", header = T)
NoAttrition = read.csv("CaseStudy2CompSet No Attrition.csv", header = TRUE)
#Looking for missing values in dataset
gg_miss_var(employee.df, show_pct = T) + labs(title = 'Percent Missing Values') + theme(plot.title = element_text(hjust = 0.5))
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
# No missing values are found for any variables (columns) in the dataset.
employee.df %>% ggplot(aes(x = Attrition, fill = Attrition)) +
geom_bar() +
ggtitle("Distribution of Attrition") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_fill_manual(values = Rcolors)
ggplot(employee.df, aes(x=MonthlyIncome, fill = Attrition)) +
geom_histogram() +
ggtitle("Monthly Income Based on Attrition") +
xlab("Monthly Income") +
ylab("Count") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_fill_manual(values = Rcolors)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# We see from the histogram that most number of employees have salary in range 2000-4000 and then second highest range is 4000-6000. Let's create income groups and look at Attrition for those income groups.
employee.df$IncomeGroup <- cut(employee.df$MonthlyIncome, breaks = c(0, 2000, 4000, 6000, 10000, 15000, 20000), labels = c("<$2000","$2000-$4000","$4000-$6000","$6000-$10000","$10000-$15000","$15000-$20000"))
employee.df %>% ggplot(aes_string(x = "IncomeGroup", fill = "Attrition")) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(x = "Income Group", y = "Percent Employees", title = "Attrition for Income Groups") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_fill_manual(values = Rcolors)
# Lowest Income Group (<$2000) has the highest Attrition rate, followed by second lowest Income Group ($2000-$4000).
# Let's create age groups and look at Attrition for those age groups.
employee.df$AgeGroup <- cut(employee.df$Age, breaks = c(18,25,35,45,60), labels = c("18-25","25-35","35-45","45-60"), include.lowest = TRUE)
employee.df %>% ggplot(aes_string(x = "AgeGroup", fill = "Attrition")) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(x = "Age Group", y = "Percent Employees", title = "Attrition for Age Groups") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_fill_manual(values = Rcolors)
# Younger age groups 18-25 and 25-35 have higher attrition rate.
employee.df %>% ggplot(aes_string(x = "JobRole", fill = "Attrition")) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(x = "Job Role", y = "Percent Employees", title = "Attrition for Job Role") +
theme(plot.title = element_text(hjust = 0.5)) +
coord_flip() +
scale_x_discrete(labels = function(x) str_wrap(x, width = 12)) +
scale_fill_manual(values = Rcolors)
# Research Directors have highest attrition rate whereas Sales Representatives have the least.
ggplot(employee.df, aes(x=MonthlyIncome, fill = Attrition)) +
geom_histogram() +
facet_wrap(~JobRole) +
ggtitle("Monthly Income based on Job Role") +
xlab("Monthly Income") +
ylab("Count") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_fill_manual(values = Rcolors)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(employee.df, aes(x=JobSatisfaction, fill = Attrition)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
ggtitle("Attrition based on Job Role and Job Satisfaction") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_fill_manual(values = Rcolors) +
facet_wrap(~JobRole)
ggplot(employee.df, aes(x=Age, y=MonthlyIncome, color = Attrition)) +
geom_point() +
facet_wrap(~JobRole) +
ggtitle("Attrition based on Age and Monthly Income") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_color_manual(values = Rcolors)
employee.df %>% ggplot(aes_string(x = "OverTime", fill = "Attrition")) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(x = "Overtime", y = "Percent Employees", title = "Attrition for Overtime") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_fill_manual(values = Rcolors)
# Higher attrition rate for employees who work overtime
employee.df %>% ggplot(aes_string(x = "BusinessTravel", fill = "Attrition")) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(x = "Business Travel", y = "Percent Employees", title = "Attrition for Business Travel") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_fill_manual(values = Rcolors)
# Attrition level goes up from Non-Travel to Travel-Frequently category.
# divorce contribute to attrition
bar <- ggplot(employee.df, aes(x=MaritalStatus, fill = MaritalStatus)) +
geom_bar() +
facet_wrap(~Attrition) +
ggtitle("Attrition Dependent on Marital Status ") +
xlab("Attrition") +
ylab("Count")
bar <- bar + scale_fill_manual(values = Rcolors)
# Yes Attrition
# make new data frames for married, divorced, and single
df1 <- employee.df %>%
filter(Attrition == "Yes")
df1length <- NROW(df1)
df1married <- length(grep("Married", df1$MaritalStatus))
df1divorced <- length(grep("Divorced", df1$MaritalStatus))
df1single <- length(grep("Single", df1$MaritalStatus))
Yes_df <- data.frame(MaritalStatus = c("Married","Divorced","Single"),
Value = c(df1married,df1divorced,df1single))
plot1 <- ggplot(Yes_df, aes(x="", y=Value, fill=MaritalStatus)) +
geom_bar(width = 1, stat = "identity") + xlab("") + ylab("") + ggtitle("Attrition: Yes")
pie1 <- plot1 + coord_polar("y", start=0)
pie1 <- pie1 + scale_fill_manual(values = Rcolors)
# No Attrition
# make new data frames for married, divorced, and single
df2 <- employee.df %>%
filter(Attrition == "No")
df2length <- NROW(df2)
df2married <- length(grep("Married", df2$MaritalStatus))
df2divorced <- length(grep("Divorced", df2$MaritalStatus))
df2single <- length(grep("Single", df2$MaritalStatus))
No_df <- data.frame(MaritalStatus = c("Married","Divorced","Single"),
Value = c(df2married,df2divorced,df2single))
plot2 <- ggplot(No_df, aes(x="", y=Value, fill=MaritalStatus)) +
geom_bar(width = 1, stat = "identity") + xlab("") + ylab("") + ggtitle("Attrition: No")
pie2 <- plot2 + coord_polar("y", start=0)
pie2 <- pie2 + scale_fill_manual(values = Rcolors)
piecharts <- plot_grid(pie2,pie1, ncol = 2, labels = c("B","C"))
allplots <- plot_grid(bar,piecharts, nrow = 2, labels = "A")
allplots
EDA1 <- ggplot(employee.df, aes(x=RelationshipSatisfaction, fill = Attrition)) +
geom_bar(position = "fill") +
#facet_wrap(~JobRole) +
ggtitle("Relationship Satisfaction") +
ylab("% Count") +
scale_y_continuous(labels = scales::percent) +
theme(legend.position = "none") +
scale_fill_manual(values = Rcolors)
EDA1 <- EDA1 + theme(plot.title = element_text(size = 12, hjust = 0.5))
EDA2 <- ggplot(employee.df, aes(x=JobLevel, fill = Attrition)) +
geom_bar(position = "fill") +
#facet_wrap(~JobRole) +
ggtitle("Job Level") +
ylab("% Count") +
scale_y_continuous(labels = scales::percent) +
theme(legend.position = "none") +
scale_fill_manual(values = Rcolors)
EDA2 <- EDA2 + theme(plot.title = element_text(size = 12, hjust = 0.5))
EDA3 <- ggplot(employee.df, aes(x=JobSatisfaction, fill = Attrition)) +
geom_bar(position = "fill") +
ggtitle("Job Satisfaction") +
ylab("% Count") +
scale_y_continuous(labels = scales::percent) +
theme(legend.position = "none") +
scale_fill_manual(values = Rcolors)
EDA3 <- EDA3 + theme(plot.title = element_text(size = 12, hjust = 0.5))
EDA4 <- ggplot(employee.df, aes(x=YearsWithCurrManager, fill = Attrition)) +
geom_bar(position = "fill") +
ggtitle("Years With Current Manager") +
ylab("% Count") +
scale_y_continuous(labels = scales::percent) +
theme(legend.position = "none") +
scale_fill_manual(values = Rcolors)
EDA4 <- EDA4 + theme(plot.title = element_text(size = 12, hjust = 0.5))
EDA5 <- ggplot(employee.df, aes(x=YearsSinceLastPromotion, fill = Attrition)) +
geom_bar(position = "fill") +
ggtitle("Years Since Last Promotion") +
ylab("% Count") +
scale_y_continuous(labels = scales::percent) +
theme(legend.position = "none") +
scale_fill_manual(values = Rcolors)
EDA5 <- EDA5 + theme(plot.title = element_text(size = 12, hjust = 0.5))
EDA6 <- ggplot(employee.df, aes(x=YearsAtCompany, fill = Attrition)) +
geom_bar(position = "fill") +
ggtitle("Years At Company") +
ylab("% Count") +
scale_y_continuous(labels = scales::percent) +
theme(legend.position = "none") +
scale_fill_manual(values = Rcolors)
EDA6 <- EDA6 + theme(plot.title = element_text(size = 12, hjust = 0.5))
EDA7 <- ggplot(employee.df, aes(x=YearsInCurrentRole, fill = Attrition)) +
geom_bar(position = "fill") +
ggtitle("Years in Current Role") +
ylab("% Count") +
scale_y_continuous(labels = scales::percent) +
theme(legend.position = "none") +
scale_fill_manual(values = Rcolors)
EDA7 <- EDA7 + theme(plot.title = element_text(size = 12, hjust = 0.5))
EDA8 <- ggplot(employee.df, aes(x=NumCompaniesWorked, fill = Attrition)) +
geom_bar(position = "fill") +
ggtitle("Companies Worked") +
ylab("% Count") +
scale_y_continuous(labels = scales::percent) +
theme(legend.position = "none") +
scale_fill_manual(values = Rcolors)
EDA8 <- EDA8 + theme(plot.title = element_text(size = 12, hjust = 0.5))
EDA9 <- ggplot(employee.df, aes(x=MaritalStatus, fill = Attrition)) +
geom_bar(position = "fill") +
ggtitle("Marital Status") +
ylab("% Count") +
scale_y_continuous(labels = scales::percent) +
theme(legend.position = "none") +
scale_fill_manual(values = Rcolors)
EDA9 <- EDA9 + theme(plot.title = element_text(size = 12, hjust = 0.5))
EDA10 <- plot_grid(EDA1,EDA2,EDA3,EDA4,EDA5,EDA6,EDA7,EDA8,EDA9, ncol = 3, nrow = 3)
EDA10
EDA.A <- ggplot(employee.df, aes(x=TotalWorkingYears, y=PercentSalaryHike, color = Attrition)) +
geom_point() +
ggtitle("Total Working Years vs % Salary Hike") +
scale_color_manual(values = Rcolors) +
ylab("% Salary Hike") +
theme(plot.title = element_text(size = 12, hjust = 0.5))
EDA.B <- ggplot(employee.df, aes(x=YearsWithCurrManager, y=PercentSalaryHike, color = Attrition)) +
geom_point() +
ggtitle("Yrs With Manager vs % Salary Hike") +
scale_color_manual(values = Rcolors) +
ylab("% Salary Hike") +
xlab("Yrs With Manager") +
theme(plot.title = element_text(size = 12, hjust = 0.5))
EDA.C <-ggplot(employee.df, aes(x=YearsAtCompany, y=YearsInCurrentRole, color = Attrition)) +
geom_point() +
ggtitle("Yrs At Company vs Yrs In Role") +
scale_color_manual(values = Rcolors) +
ylab("Yrs In Role") +
theme(plot.title = element_text(size = 12, hjust = 0.5))
# EDA.D <- ggplot(employee.df, aes(x=YearsSinceLastPromotion, y=NumCompaniesWorked, color = Attrition)) +
# geom_point() +
# ggtitle("Yrs Last Promotion vs Num Companies Worked") +
# scale_color_manual(values = Rcolors) +
# theme(plot.title = element_text(size = 12, hjust = 0.5))
EDA.E <- ggplot(employee.df, aes(x=Age, y=YearsSinceLastPromotion, color = Attrition)) +
geom_point() +
ggtitle("Age vs Job Yrs Last Promotion") +
scale_color_manual(values = Rcolors) +
ylab("Yrs Last Promotion") +
theme(plot.title = element_text(size = 12, hjust = 0.5))
# EDA.F <- ggplot(employee.df, aes(x=Age, y=PercentSalaryHike, color = Attrition)) +
# geom_point() +
# ggtitle("Age vs % Salary Hike") +
# scale_color_manual(values = Rcolors) +
# ylab("% Salary Hike") +
# theme(plot.title = element_text(size = 12, hjust = 0.5))
EDA.G <- plot_grid(EDA.A,EDA.B,EDA.C,EDA.E, ncol = 2, nrow = 2)
EDA.G
set.seed(9)
NB.df <- select(employee.df, "Attrition", "Age", "Department", "EnvironmentSatisfaction", "JobInvolvement", "JobLevel", "JobRole", "JobSatisfaction", "MaritalStatus", "MonthlyIncome", "NumCompaniesWorked", "OverTime", "StockOptionLevel", "TotalWorkingYears", "WorkLifeBalance", "YearsAtCompany", "YearsInCurrentRole", "YearsWithCurrManager")
NBsplitPerc <- .85
NBtrainIndices <- sample(1:dim(NB.df)[1],round(NBsplitPerc * dim(NB.df)[1]))
NBdfTrain <- NB.df[NBtrainIndices,]
NBdfTest <- NB.df[-NBtrainIndices,]
# run NB model
NBmodel = naiveBayes(Attrition~.,data = NBdfTrain)
confusionMatrix(table(predict(NBmodel,NBdfTest),NBdfTest$Attrition))
## Confusion Matrix and Statistics
##
##
## No Yes
## No 96 6
## Yes 10 18
##
## Accuracy : 0.8769
## 95% CI : (0.8078, 0.928)
## No Information Rate : 0.8154
## P-Value [Acc > NIR] : 0.0401
##
## Kappa : 0.616
##
## Mcnemar's Test P-Value : 0.4533
##
## Sensitivity : 0.9057
## Specificity : 0.7500
## Pos Pred Value : 0.9412
## Neg Pred Value : 0.6429
## Prevalence : 0.8154
## Detection Rate : 0.7385
## Detection Prevalence : 0.7846
## Balanced Accuracy : 0.8278
##
## 'Positive' Class : No
##
# make predictions with model
NBpreds = predict(NBmodel, NoAttrition)
NBpreds = data.frame(NBpreds)
NoAttrition$Attrition = NBpreds$NBpreds
Attrition.Classify = NoAttrition %>% select("ID", "Attrition") %>% arrange(ID)
#write.csv(Attrition.Classify, file = "Case2PredictionsBoyd_Pandey Attrition.csv", row.names=FALSE)
# plot counts of attrition
ggplot(Attrition.Classify, aes(x=Attrition, fill=Attrition)) +
geom_bar(fill = Rcolors[c(1,2)]) +
ggtitle("Predicted Attrition") +
ylab("Count") +
theme(plot.title = element_text(hjust = 0.5))
iterations = 100
masterAcc = matrix(nrow = iterations)
masterSens = matrix(nrow = iterations)
masterSpec = matrix(nrow = iterations)
splitPerc = .85 #Training / Test split Percentage
for(j in 1:iterations)
{
trainIndices = sample(1:dim(NB.df)[1],round(splitPerc * dim(NB.df)[1]))
train = NB.df[trainIndices,]
test = NB.df[-trainIndices,]
model = naiveBayes(train[,c(2:18)],train$Attrition)
table(predict(model,test[,c(2:18)]),test$Attrition)
CM = confusionMatrix(table(predict(model,test[,c(2:18)]),test$Attrition))
masterAcc[j] = CM$overall[1]
masterSens[j] = CM$byClass[1]
masterSpec[j] = CM$byClass[2]
}
# accuracy measurements
MeanAcc = colMeans(masterAcc)
MeanAcc
## [1] 0.8
masterAcc <- data.frame(masterAcc)
masterAcc$TestNumber <- 1:nrow(masterAcc)
max(masterAcc$masterAcc)
## [1] 0.8846154
min(masterAcc$masterAcc)
## [1] 0.6923077
ggplot(masterAcc, aes(x=seq(1,100,1), y=masterAcc)) +
geom_line(color="darkslateblue") +
ggtitle("Accuracy of 100 Train/Test Sets") +
xlab("Number of Train/Test Sets") +
ylab("Accuracy") +
theme(plot.title = element_text(hjust = 0.5))
# sensitivity measurements
MeanSens = colMeans(masterSens)
MeanSens
## [1] 0.8387946
masterSens <- data.frame(masterSens)
masterSens$TestNumber <- 1:nrow(masterSens)
max(masterSens$masterSens)
## [1] 0.9528302
min(masterSens$masterSens)
## [1] 0.7272727
ggplot(masterSens, aes(x=seq(1,100,1), y=masterSens)) +
geom_line(color="darkslateblue") +
ggtitle("Sensitivity of 100 Train/Test Sets") +
xlab("Number of Train/Test Sets") +
ylab("Sensitivity") +
theme(plot.title = element_text(hjust = 0.5))
# specificity measurements
MeanSpec = colMeans(masterSpec)
MeanSpec
## [1] 0.6028654
masterSpec <- data.frame(masterSpec)
masterSpec$TestNumber <- 1:nrow(masterSpec)
max(masterSpec$masterSpec)
## [1] 0.8666667
min(masterSpec$masterSpec)
## [1] 0.3636364
ggplot(masterSpec, aes(x=seq(1,100,1), y=masterSpec)) +
geom_line(color="darkslateblue") +
ggtitle("Specificity of 100 Train/Test Sets") +
xlab("Number of Train/Test Sets") +
ylab("Specificity") +
theme(plot.title = element_text(hjust = 0.5))
# plot together
masterAcc$Value <- masterAcc$masterAcc
masterAcc$Test <- "Accuracy"
masterSpec$Value <- masterSpec$masterSpec
masterSpec$Test <- "Specificity"
masterSens$Value <- masterSens$masterSens
masterSens$Test <- "Sensitivity"
NB.df <- rbind(masterAcc[,c(2,3,4)],masterSens[,c(2,3,4)],masterSpec[,c(2,3,4)])
ggplot(NB.df, aes(x=TestNumber, y=Value, linetype=Test, colors=Test)) +
geom_line(color = "darkslateblue") +
ggtitle("Accuracy, Sensitivity, & Specificity of 100 Train/Test Sets") +
xlab("Number of Train/Test Sets") +
ylab("Statistical Values") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_color_manual(values = Rcolors)
kNNsplitPerc <- .7
kNNtrainIndices <- sample(1:dim(employee.df)[1],round(kNNsplitPerc * dim(employee.df)[1]))
kNNdfTrain <- employee.df[kNNtrainIndices,]
kNNdfTest <- employee.df[-kNNtrainIndices,]
# Internal Model
classifications <- knn.cv(employee.df[,c(2,35)],employee.df$Attrition, k = 20)
confusionMatrix(table(classifications,employee.df$Attrition))
## Confusion Matrix and Statistics
##
##
## classifications No Yes
## No 724 129
## Yes 6 11
##
## Accuracy : 0.8448
## 95% CI : (0.819, 0.8683)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 0.3422
##
## Kappa : 0.1091
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.99178
## Specificity : 0.07857
## Pos Pred Value : 0.84877
## Neg Pred Value : 0.64706
## Prevalence : 0.83908
## Detection Rate : 0.83218
## Detection Prevalence : 0.98046
## Balanced Accuracy : 0.53518
##
## 'Positive' Class : No
##
# External Model
classifications1 <- knn(kNNdfTrain[,c(2,35)], kNNdfTest[,c(2,35)], kNNdfTrain$Attrition,
prob = TRUE, k = 10)
confusionMatrix(table(classifications1,kNNdfTest$Attrition))
## Confusion Matrix and Statistics
##
##
## classifications1 No Yes
## No 218 36
## Yes 3 4
##
## Accuracy : 0.8506
## 95% CI : (0.8014, 0.8915)
## No Information Rate : 0.8467
## P-Value [Acc > NIR] : 0.4737
##
## Kappa : 0.1305
##
## Mcnemar's Test P-Value : 2.99e-07
##
## Sensitivity : 0.9864
## Specificity : 0.1000
## Pos Pred Value : 0.8583
## Neg Pred Value : 0.5714
## Prevalence : 0.8467
## Detection Rate : 0.8352
## Detection Prevalence : 0.9732
## Balanced Accuracy : 0.5432
##
## 'Positive' Class : No
##
# Train and Test for Linear Regression
set.seed(9)
lmsplitPerc <- .8
trainIndices <- sample(1:dim(employee.df)[1],round(lmsplitPerc * dim(employee.df)[1]))
dfTrain <- employee.df[trainIndices,]
dfTest <- employee.df[-trainIndices,]
dfTrain <- na.omit(dfTrain)
dfTest <- na.omit(dfTest)
# multiple variable linear regression with train and test
Model1_fit = lm(MonthlyIncome~Age+JobLevel+JobRole, data = dfTrain)
summary(Model1_fit)
##
## Call:
## lm(formula = MonthlyIncome ~ Age + JobLevel + JobRole, data = dfTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3613.5 -719.0 -4.9 630.4 4012.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -269.283 290.067 -0.928 0.353556
## Age 11.298 5.424 2.083 0.037602 *
## JobLevel 2995.928 79.096 37.877 < 2e-16 ***
## JobRoleHuman Resources -447.969 294.175 -1.523 0.128271
## JobRoleLaboratory Technician -754.446 198.958 -3.792 0.000163 ***
## JobRoleManager 3839.944 265.368 14.470 < 2e-16 ***
## JobRoleManufacturing Director 35.227 190.876 0.185 0.853633
## JobRoleResearch Director 3845.594 252.323 15.241 < 2e-16 ***
## JobRoleResearch Scientist -405.396 200.443 -2.022 0.043513 *
## JobRoleSales Executive -240.793 171.281 -1.406 0.160226
## JobRoleSales Representative -627.789 246.268 -2.549 0.011013 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1102 on 685 degrees of freedom
## Multiple R-squared: 0.9433, Adjusted R-squared: 0.9425
## F-statistic: 1140 on 10 and 685 DF, p-value: < 2.2e-16
#show residuals
hist(Model1_fit$residuals, col = "darkslateblue", main = "Histogram of Residuals")
sqrt(sum((Model1_fit$residuals)^2))
## [1] 28853.5
# predict with this model
preds2 <- predict(Model1_fit, newdata = dfTest)
# assess model
MSPE = data.frame(Observed = dfTest$MonthlyIncome, Predicted = preds2)
MSPE$Resisdual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Resisdual^2
mean(MSPE$SquaredResidual)
## [1] 992086.6
# multiple variable linear regression
Model2_fit = lm(MonthlyIncome~Age+JobLevel+JobRole, data = employee.df)
summary(Model2_fit)
##
## Call:
## lm(formula = MonthlyIncome ~ Age + JobLevel + JobRole, data = employee.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3639.3 -671.7 -46.5 634.8 4097.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -564.822 249.203 -2.267 0.02367 *
## Age 13.701 4.729 2.898 0.00386 **
## JobLevel 3032.839 69.606 43.571 < 2e-16 ***
## JobRoleHuman Resources -324.725 255.782 -1.270 0.20459
## JobRoleLaboratory Technician -526.524 171.357 -3.073 0.00219 **
## JobRoleManager 3968.183 232.376 17.077 < 2e-16 ***
## JobRoleManufacturing Director 85.832 169.605 0.506 0.61294
## JobRoleResearch Director 3941.604 217.548 18.118 < 2e-16 ***
## JobRoleResearch Scientist -249.221 171.560 -1.453 0.14668
## JobRoleSales Executive -127.854 146.073 -0.875 0.38167
## JobRoleSales Representative -404.623 215.498 -1.878 0.06077 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1080 on 859 degrees of freedom
## Multiple R-squared: 0.9455, Adjusted R-squared: 0.9448
## F-statistic: 1490 on 10 and 859 DF, p-value: < 2.2e-16
#show residuals
hist(Model2_fit$residuals, col = "darkslateblue", main = "Histogram of Residuals")
sqrt(sum((Model2_fit$residuals)^2))
## [1] 31647.6
# predict with this model
preds2 <- predict(Model2_fit, newdata = NoSalary)
preds2 <- as.data.frame(preds2)
NoSalary$Salary <- preds2[,1]
# plot predicted values
NoSalary %>% ggplot(aes(x = Age, y = Salary)) +
geom_point(color = "darkslateblue") +
ggtitle("Predicted Salary by Age") +
ylab("Salary") +
theme(plot.title = element_text(hjust = 0.5))
Salary.Classify = NoSalary %>% select("ID", "Salary") %>% arrange(ID)
#write.csv(Salary.Classify, file = "Case2PredictionsBoyd_Pandey Salary.csv", row.names=FALSE)